## Warning in FUN(X[[i]], ...): Found and resolved improper
## quoting out-of-sample. First healed line 139280: <<Hive;
## 837c7b84c6e897d17eb2fc156fd5dd25b7468053;Navis Ryu <navis@apache.org>;
## 2015-01-21T07:57:16;"hcatalog/src/test/e2e/templeton/inpdir/
## PigJoin\344\266\264\343\204\251\351\274\276\344\270\204\347\213\234\343\200\207work.pig\"";-4>>.
## If the fields are not quoted (e.g. field separator does not appear within
## any field), try quote="" to avoid this warning.
## Warning in FUN(X[[i]], ...): Found and resolved improper
## quoting out-of-sample. First healed line 433308:
## <<MariaDB;e4963e7c3762c495d6b67f855e618c3ff8175b7e;"Reggie"/Reggie@xp. <>;
## 2007-06-07T09:13:56;sql-common/client.c;0>>. If the fields are not quoted
## (e.g. field separator does not appear within any field), try quote="" to
## avoid this warning.
## Warning in FUN(X[[i]], ...): Found and resolved improper
## quoting out-of-sample. First healed line 507408:
## <<MySQL;e4963e7c3762c495d6b67f855e618c3ff8175b7e;"Reggie"/Reggie@xp. <>;
## 2007-06-07T09:13:56;sql-common/client.c;0>>. If the fields are not quoted
## (e.g. field separator does not appear within any field), try quote="" to
## avoid this warning.
In der alten Auswertung wurden die Files mit den meisten Commits gezeigt. Da dies über alle Datenbanken nur schwer geht habe ich mich für einen Densitiy Plot entschieden. Später folgt noch eine Auswertung mit der am stärksten vertretenen Programmiersprache. Hier ein Density Plot über die Anzahl an Commits per File in einem Repository
extractedTable_CommitCountByFile <- DBData %>%
group_by(name, file) %>%
summarise(commitCount = n_distinct(commit))
extMean <- extractedTable_CommitCountByFile %>%
group_by(name) %>%
summarise(median = median(commitCount),mean = mean(commitCount))
knitr::kable(extMean)
| name | median | mean |
|---|---|---|
| Cassandra | 3 | 11.156520 |
| Couchbase | 2 | 9.998494 |
| CouchDB | 3 | 7.327409 |
| Elasticsearch | 3 | 5.461175 |
| etcd | 2 | 5.012709 |
| Firebird | 2 | 18.172426 |
| Greenplum | 2 | 7.983630 |
| Hazelcast | 6 | 10.883780 |
| HBase | 3 | 8.481786 |
| Hive | 2 | 6.676108 |
| Impala | 3 | 9.729001 |
| InfluxDB | 3 | 7.764769 |
| MariaDB | 4 | 11.512196 |
| Memcached | 3 | 13.112403 |
| MongoDB | 2 | 4.943705 |
| MySQL | 3 | 8.792365 |
| Neo4j | 4 | 7.778134 |
| PostgreSQL | 6 | 23.587189 |
| Presto | 3 | 8.922513 |
| Realm | 2 | 6.935587 |
| Redis | 3 | 11.604444 |
| Solr | 3 | 6.103026 |
| Spark SQL | 2 | 5.338807 |
| Sphinx | 3 | 11.137621 |
| SQLite | 5 | 42.624477 |
ggplot(extractedTable_CommitCountByFile, aes(x=commitCount,color=name,alpha=0.5))+
geom_density(adjust = 3) +
xlim(0,max(extMean$mean)) +
ggtitle("Density of Commits") +
xlab("CommitCount") +
ylab("Part of")
## Warning: Removed 15422 rows containing non-finite values (stat_density).
ggplot(extractedTable_CommitCountByFile, aes(x=commitCount,color=name,alpha=0.5))+
geom_density(position = "stack",adjust = 3) +
geom_vline(data=extMean,aes(xintercept=mean, color=name),linetype="solid") +
xlim(0,max(extMean$mean)) +
geom_label_repel(data=extMean, mapping=aes(x=mean, y=5, label=name), size=4, angle=90, vjust=-0.4, hjust=0) +
ggtitle("Density of Commits with median as VLine") +
xlab("CommitCount limited bei maxMedianValue") +
ylab("Part of")+ theme(legend.position="none")
## Warning: Removed 15422 rows containing non-finite values (stat_density).
Gruppiert nach Datum werden die eindeutigen CommitIds für diesen Tag aufsummiert
extractedTable <- DBData %>%
group_by(timestampInDate,name) %>%
summarise(commitCount = n_distinct(commit))
Erzeugen einer extra Spalte für Monat und Jahr
extractedTable$month = month(extractedTable$timestampInDate)
extractedTable$year = year(extractedTable$timestampInDate)
Erneutes aufsummieren der Commits pro Monat
extractedTableMonth <- extractedTable %>%
group_by(year,month,name) %>%
summarise(commits = sum(commitCount))%>%
filter(!is.na(year)&!is.na(month))
Weiterer Plot als Monatsübersicht.
labels <- extractedTableMonth %>%
group_by(name)%>%
summarise(min=min(year))
PlotMonthly <-ggplot(extractedTableMonth, aes(y=commits,x=year,month,color=name)) +
geom_smooth(formula = y~x,method ='auto') +
geom_label_repel(data=labels, mapping=aes(label=name,x=min,y=-200)) +
geom_vline(data=labels,aes(xintercept=min, color=name),linetype="solid") +
ggtitle("Commits over Time (Year,Month)") +
xlab("Time") +
ylab("Commits")+ theme(legend.position="none")
Angezeigt wird die Anzahl der Commits pro Monat für alle Repositorys
PlotMonthly
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Hier nun aufgeteilt auf einzelne Jahre mit den einzelnen Datenbanken als Anteil
PlotMonthlyFacet <-
ggplot(extractedTableMonth, aes(y=commits,x=month,fill=name)) +
geom_bar(stat = "identity") +
facet_wrap(~ year, ncol = 5) +ggtitle("Commit over Time (Facet by Year, Columned by Month)") +
xlab("Time") + ylab("Commits")
PlotMonthlyFacet
loc_table <- DBData %>%
group_by(name, file) %>%
summarise(loc = sum(change)) %>%
filter(loc>0)
PlotFileSizeStacked <- ggplot(loc_table, aes(x=loc,fill = name,alpha=0.5)) +
geom_density(position = "stack") +
xlim(0,500)+
ggtitle("Dateigröße als DichtePlot Stacked") +
xlab("Lines of Code") +
ylab("Häufigkeit")
Der Plot einmal als Stacked (Ist bisschen übersichtlicher aber die einzelnen Repos verschwimmen etwas)
PlotFileSizeStacked
## Warning: Removed 26896 rows containing non-finite values (stat_density).
PlotFileSize <- ggplot(loc_table, aes(x=loc,color = name,alpha=0.5)) +
geom_density() +
xlim(0,500)+
ggtitle("Dateigröße als DichtePlot") +
xlab("Lines of Code") +
ylab("Häufigkeit")
Mit den Linien sieht man die einzelnen Plots etwas besser finde ich
PlotFileSize
## Warning: Removed 26896 rows containing non-finite values (stat_density).
Kann man noch etwas besser vergleichen
PlotFileSizeFacedWrap <- ggplot(loc_table, aes(x=loc,fill = name,alpha=0.5)) +
geom_density() +
xlim(0,500)+
facet_wrap(~name)+
ggtitle("Dateigröße als DichtePlot aufgeteilt") +
xlab("Lines of Code") +
ylab("Häufigkeit")
PlotFileSizeFacedWrap
## Warning: Removed 26896 rows containing non-finite values (stat_density).
file_table_mean <- DBData %>%
group_by(name,commit) %>%
summarise(fileSize = sum(change))%>%
summarise(mean = mean(fileSize))
PlotMeanFileSize <- ggplot(file_table_mean, aes(x = reorder(name, -mean),y=mean)) +
geom_col() +
theme(axis.text.x = element_text(angle = 90))+
ggtitle("Durchschnittliche Dateigröße") +
ylab("Lines of Code") +
xlab("Datenbanken")
PlotMeanFileSize
commit_table <- DBData %>%
group_by(name,commit)%>%
summarise(files =n())
PlotCommitSize <- ggplot(commit_table, aes(x=files,color = name,fill=name,alpha=.4)) +
geom_density(adjust = 3) +
xlim(0,15) +
ggtitle("Commitgröße als DichtePlot") +
xlab("Commitgröße") +
ylab("Anzahl Commits")
PlotCommitSize
## Warning: Removed 43883 rows containing non-finite values (stat_density).
PlotCommitSizeFacedWrap <- ggplot(commit_table, aes(x=files,fill = name,alpha=0.5)) +
geom_density(adjust = 3) +
xlim(0,15)+
facet_wrap(~name) +
ggtitle("Commitgröße als DichtePlot") +
xlab("Commitgröße") +
ylab("Häufigkeit")
PlotCommitSizeFacedWrap
## Warning: Removed 43883 rows containing non-finite values (stat_density).
commit_table_mean <- DBData %>%
group_by(name,commit) %>%
summarise(commitSize = n()) %>%
summarise(mean = mean(commitSize))
PlotMeanCommitSize <- ggplot(commit_table_mean, aes(x = reorder(name, -mean),y=mean)) +
geom_col() +
theme(axis.text.x = element_text(angle = 90))+
ggtitle("Durchschnittliche Commitgröße") +
ylab("Commitgröße") +
xlab("Datenbanken")
PlotMeanCommitSize
commit_file_mean <- commit_table_mean %>%
inner_join(file_table_mean,by="name")
PlotCompareCommitFileSize <- ggplot(commit_file_mean, aes(x=mean.x, y=mean.y,color=name)) +
geom_point()+
geom_label_repel(aes(label=name),hjust=-0.1, vjust=1)+
ggtitle("Commitgröße - Filegröße") +
ylab("Filegröße Mean") +
xlab("Commitgröße Mean")+ theme(legend.position="none")
PlotCompareCommitFileSize
change_commit_table <- DBData %>%
group_by(file,programmingLanguage,name) %>%
summarise(sum = sum(abs(change)),commitCnt = n())
PlotCommitCntFileSizeFacetWrap <- ggplot(change_commit_table, aes(x=sum,y=commitCnt,color=name)) +
geom_jitter() +
xlim(0,25000) +
ylim(0,1000) +
facet_wrap(~name)+
ggtitle("Anzahl Commits - Menge Änderungen Pro File") +
xlab("Änderungen in Zeilen") +
ylab("Anzahl Commits")
PlotCommitCntFileSizeFacetWrap
## Warning: Removed 20153 rows containing missing values (geom_point).
corr_table <- change_commit_table %>%
group_by(name) %>%
summarise(Correlation = cor(sum,commitCnt))
PlotCorrelationCommitFile <- ggplot(corr_table, aes(x=reorder(name,-Correlation),y=Correlation)) +
geom_bar(stat="identity")+
theme(axis.text.x = element_text(angle = 90))+
ggtitle("Korrelation zwischen Anzahl Zeilen und Anzahl Commits") +
ylab("Korrelation") +
xlab("Datanbank")
PlotCorrelationCommitFile
mainProgrammingLanguagesByName <- DBData %>%
group_by(name,programmingLanguage)%>%
filter(programmingLanguage!="unknown")%>%
summarise(cnt = n())%>%
arrange(name,desc(cnt))%>%
group_by(name)%>%
summarise(mainLang = first(programmingLanguage))
knitr::kable(mainProgrammingLanguagesByName)
| name | mainLang |
|---|---|
| Cassandra | Java |
| Couchbase | Ruby |
| CouchDB | Erlang |
| Elasticsearch | Java |
| etcd | Go |
| Firebird | C |
| Greenplum | C |
| Hazelcast | Java |
| HBase | Java |
| Hive | Java |
| Impala | C++ |
| InfluxDB | Go |
| MariaDB | C |
| Memcached | C |
| MongoDB | C++ |
| MySQL | C++ |
| Neo4j | Java |
| PostgreSQL | C |
| Presto | Java |
| Realm | Objective-C++ |
| Redis | C |
| Solr | Java |
| Spark SQL | Scala |
| Sphinx | C++ |
| SQLite | C |
DBDataFilteredByMainLang <- DBData %>%
inner_join(mainProgrammingLanguagesByName,by="name") %>%
filter(programmingLanguage==mainLang)
commit_table_mean_main <- DBDataFilteredByMainLang %>%
group_by(name,commit) %>%
summarise(commitSize = n()) %>%
summarise(mean = mean(commitSize))
PlotMeanCommitSizeMainLanguage <- ggplot(commit_table_mean_main, aes(x = reorder(name, -mean),y=mean)) +
geom_col() +
theme(axis.text.x = element_text(angle = 90))+
ggtitle("Durchschnittliche Commitgröße MainLanguage") +
ylab("Commitgröße") +
xlab("Datenbanken")
file_table_mean_main <- DBDataFilteredByMainLang %>%
group_by(name,commit) %>%
summarise(fileSize = sum(change))%>%
summarise(mean = mean(fileSize))
PlotMeanFileSizeMainLanguage <- ggplot(file_table_mean_main, aes(x = reorder(name, -mean),y=mean)) +
geom_col() +
theme(axis.text.x = element_text(angle = 90))+
ggtitle("Durchschnittliche Dateigröße MainLanguage") +
ylab("Lines of Code") +
xlab("Datenbanken")
commit_file_mean_main <- commit_table_mean_main %>%
inner_join(file_table_mean_main,by="name")
PlotCompareCommitFileSizeMainLanguage <- ggplot(commit_file_mean_main, aes(x=mean.x, y=mean.y,color=name)) +
geom_point()+
geom_label_repel(aes(label=name),hjust=-0.1, vjust=1)+
ggtitle("Commitgröße - Filegröße MainLanguage") +
ylab("Filegröße Mean") +
xlab("Commitgröße Mean")+ theme(legend.position="none")
change_commit_table_main <- DBDataFilteredByMainLang %>%
group_by(file,programmingLanguage,name) %>%
summarise(sum = sum(abs(change)),commitCnt = n())
PlotCommitCntFileSizeFacetWrapMainLanguage <- ggplot(change_commit_table_main, aes(x=sum,y=commitCnt,color=name)) +
geom_jitter() +
xlim(0,25000) +
ylim(0,1000) +
facet_wrap(~name)+
ggtitle("Anzahl Commits - Menge Änderungen MainLanguage") +
xlab("Änderungen in Zeilen") +
ylab("Anzahl Commits")
corr_table_main <- change_commit_table_main %>%
group_by(name) %>%
summarise(Correlation = cor(sum,commitCnt))
PlotCorrelationCommitFileMainLanguage <- ggplot(corr_table_main, aes(x=reorder(name,-Correlation),y=Correlation)) +
geom_bar(stat="identity")+
theme(axis.text.x = element_text(angle = 90))+
ggtitle("Korrelation zwischen Anzahl Zeilen und Anzahl Commits MainLanguage") +
ylab("Korrelation") +
xlab("Datanbank")
require(gridExtra)
grid.arrange(PlotMeanCommitSizeMainLanguage, PlotMeanCommitSize, ncol=2)
grid.arrange(PlotMeanFileSizeMainLanguage, PlotMeanFileSize, ncol=2)
grid.arrange(PlotCompareCommitFileSizeMainLanguage, PlotCompareCommitFileSize, ncol=2)
grid.arrange(PlotCommitCntFileSizeFacetWrapMainLanguage, PlotCommitCntFileSizeFacetWrap, ncol=2)
## Warning: Removed 330 rows containing missing values (geom_point).
## Warning: Removed 20287 rows containing missing values (geom_point).
grid.arrange(PlotCorrelationCommitFileMainLanguage, PlotCorrelationCommitFile, ncol=2)